Predicting whether a patient has diabetes
This is a practice on the PIMA Indian diabetes dataset, to predict whether or not a patient has diabetes.
I followed Rebecca Barter’s blog post quite closely for this exercise.
data("PimaIndiansDiabetes2")
diabetes_orig <- PimaIndiansDiabetes2
Notes from the mlbench package:
The data set PimaIndiansDiabetes2 contains a corrected version of the original data set. While the UCI repository index claims that there are no missing values, closer inspection of the data shows several physical impossibilities, e.g., blood pressure or body mass index of 0. In PimaIndiansDiabetes2, all zero values of glucose, pressure, triceps, insulin and mass have been set to NA.
glimpse(diabetes_orig) # 768 x 9
Rows: 768
Columns: 9
$ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, …
$ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110,…
$ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80,…
$ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA,…
$ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, N…
$ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.…
$ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.…
$ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57,…
$ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, n…
Goal: To predict diabetes (Y) from the other 8 numerical variables (X):
pregnant - Number of times pregnant glucose - Plasma glucose concentration (glucose tolerance test) pressure - Diastolic blood pressure (mm Hg) triceps - Triceps skin fold thickness (mm) insulin - 2-Hour serum insulin (mu U/ml) mass - Body mass index (weight in kg/(height in m)^2) pedigree - Diabetes pedigree function age - Age (years)
pregnant glucose pressure triceps insulin mass pedigree
0 5 35 227 374 11 0
age diabetes
0 0
diabetes_orig %>%
count(diabetes) %>% # not evenly distributed
ggplot(aes(diabetes, n)) +
geom_col(aes(fill = diabetes), show.legend = F) +
scale_fill_jco() +
labs(title = "Distribution of Y variable - No. of patients with diabetes") +
theme_clean()
diabetes_orig %>%
ggpairs(aes(col = diabetes, fill = diabetes)) +
scale_color_jco() +
scale_fill_jco() +
theme_clean()
Skewed: - pregnant - glucose - triceps - insulin - mass - pedigree - age
Not skewed: - pressure
Almost all the x variables are skewed.
Some correlation for: - number of times pregnant and age - insulin and glucose concentration - triceps thickness and bmi
# 1 plot
median_df <- diabetes_orig %>%
group_by(diabetes) %>%
summarise(median = median(pregnant))
diabetes_orig %>%
ggplot(aes(diabetes, pregnant)) +
geom_boxplot(aes(fill = diabetes), show.legend = F) +
geom_text(data = median_df, aes(x = diabetes,
y = median,
label = median),
vjust = -1, size = 5) +
scale_fill_jco() +
labs(title = "Pregnant") +
theme_clean()
# create function
plot_boxplots_against_outcome <- function(var_y){
median_df <- diabetes_orig %>%
group_by(diabetes) %>%
summarise(median = median({{var_y}}, na.rm = T)) # must remove to calculate
diabetes_orig %>%
ggplot(aes(diabetes, {{var_y}})) +
geom_boxplot(aes(fill = diabetes), show.legend = F) +
geom_text(data = median_df, aes(x = diabetes,
y = median,
label = median),
vjust = -0.5, size = 4, fontface = "bold") +
scale_fill_jco() +
labs(title = str_to_title(
as_label(enquo(var_y)))) +
theme_clean()
}
# set names to loop through
num_var <- diabetes_orig %>%
select(-diabetes) %>%
names()
num_var %>%
syms() %>% # take strings as input and turn them into symbols
map(function(var) plot_boxplots_against_outcome({{var}})) %>%
patchwork::wrap_plots()
People with diabetes tend to have: - lower number of times pregnant - lower blood glucose - lower triceps value - lower bmi - lower pedigree - lower age
diabetes_orig %>%
ggplot(aes(insulin)) +
geom_freqpoly(aes(col = diabetes), size = 2) +
labs(title = "Insulin") +
theme_clean() +
scale_color_jco()
# a function
plot_freqpoly <- function(var_x) {
diabetes_orig %>%
ggplot(aes({{var_x}})) +
geom_freqpoly(aes(col = diabetes), size = 2) +
labs(title = str_to_title(as_label(enquo(var_x)))) +
theme_clean() +
scale_color_jco()
}
# map
num_var %>%
syms() %>% # take strings as input and turn them into symbols
map(function(var) plot_freqpoly({{var}})) %>%
patchwork::wrap_plots()
This is done so that the reference level is Positive (instead of alphabetical order).
diabetes <- diabetes_orig %>%
mutate(diabetes = fct_relevel(diabetes,
"pos", "neg"))
glimpse(diabetes)
Rows: 768
Columns: 9
$ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, …
$ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110,…
$ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80,…
$ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA,…
$ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, N…
$ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.…
$ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.…
$ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57,…
$ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, n…
set.seed(2021110401)
diabetes_split <- initial_split(diabetes, prop = 0.75,
strata = "diabetes")
diabetes_train <- training(diabetes_split)
diabetes_test <- testing(diabetes_split)
diabetes_cv <- vfold_cv(diabetes_train)
diabetes_recipe <-
recipe(diabetes ~ ., data = diabetes) %>%
step_impute_median(all_predictors()) %>%
step_log(pregnant, glucose, insulin, triceps, mass, pedigree, age,
offset = 1) %>%
step_normalize(all_numeric()) %>%
themis::step_smote(diabetes) # class imbalance
diabetes_recipe
Recipe
Inputs:
role #variables
outcome 1
predictor 8
Operations:
Median Imputation for all_predictors()
Log transformation on pregnant, glucose, insulin, triceps,...
Centering and scaling for all_numeric()
SMOTE based on diabetes
diabetes_preprocessed <- diabetes_recipe %>%
prep(diabetes_train) %>%
bake(new_data = NULL) # returns the training dataset
diabetes_preprocessed
# A tibble: 750 × 9
pregnant glucose pressure triceps insulin mass pedigree age
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.806 -1.15 -0.566 -0.601 -0.541 -0.578 -1.03 -1.23
2 0.614 -0.0790 0.125 0.201 0.00226 -1.01 -0.891 -0.146
3 1.40 -0.114 -0.0481 0.201 0.00226 0.488 -1.18 -0.250
4 0.378 -0.293 1.68 0.201 0.00226 0.784 -0.933 -0.146
5 1.40 0.650 0.642 0.201 0.00226 -0.747 2.66 1.84
6 -0.806 -0.558 -3.67 0.919 -0.788 1.45 -0.966 0.148
7 0.0898 0.254 1.33 1.15 1.29 0.992 0.860 -0.469
8 1.14 -0.717 0.987 0.201 0.00226 0.501 -0.167 1.43
9 -0.806 -0.799 -0.566 -1.87 0.252 -1.47 0.178 -1.09
10 1.71 0.821 0.815 -1.17 -0.228 -1.67 -0.711 1.84
# … with 740 more rows, and 1 more variable: diabetes <fct>
skim(diabetes_preprocessed) # no missing
Name | diabetes_preprocessed |
Number of rows | 750 |
Number of columns | 9 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 8 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
diabetes | 0 | 1 | FALSE | 2 | pos: 375, neg: 375 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
pregnant | 0 | 1 | 0.05 | 1.01 | -1.70 | -0.81 | 0.09 | 0.89 | 2.03 | ▅▆▆▇▂ |
glucose | 0 | 1 | 0.15 | 1.00 | -3.96 | -0.56 | 0.12 | 0.85 | 2.10 | ▁▁▅▇▅ |
pressure | 0 | 1 | 0.05 | 0.97 | -3.67 | -0.57 | -0.05 | 0.64 | 4.27 | ▁▃▇▂▁ |
triceps | 0 | 1 | 0.07 | 0.95 | -4.04 | -0.12 | 0.20 | 0.50 | 3.87 | ▁▂▇▂▁ |
insulin | 0 | 1 | 0.07 | 0.96 | -4.25 | 0.00 | 0.00 | 0.18 | 3.85 | ▁▁▇▂▁ |
mass | 0 | 1 | 0.10 | 0.97 | -2.58 | -0.56 | 0.14 | 0.74 | 3.52 | ▁▅▇▂▁ |
pedigree | 0 | 1 | 0.07 | 1.01 | -1.43 | -0.69 | -0.16 | 0.68 | 4.35 | ▇▆▂▁▁ |
age | 0 | 1 | 0.08 | 0.97 | -1.23 | -0.70 | -0.15 | 0.82 | 2.94 | ▇▅▅▂▁ |
diabetes_preprocessed %>%
ggpairs(aes(col = diabetes, fill = diabetes)) +
scale_color_jco() +
scale_fill_jco() +
theme_clean()
summary(diabetes_preprocessed)
pregnant glucose pressure
Min. :-1.70175 Min. :-3.9624 Min. :-3.67267
1st Qu.:-0.80595 1st Qu.:-0.5577 1st Qu.:-0.56589
Median : 0.08984 Median : 0.1243 Median :-0.04809
Mean : 0.05348 Mean : 0.1462 Mean : 0.05256
3rd Qu.: 0.89488 3rd Qu.: 0.8487 3rd Qu.: 0.64230
Max. : 2.03365 Max. : 2.1001 Max. : 4.26688
triceps insulin mass
Min. :-4.04089 Min. :-4.246593 Min. :-2.5832
1st Qu.:-0.11813 1st Qu.: 0.002261 1st Qu.:-0.5575
Median : 0.20058 Median : 0.002261 Median : 0.1438
Mean : 0.07351 Mean : 0.068787 Mean : 0.1011
3rd Qu.: 0.50040 3rd Qu.: 0.184862 3rd Qu.: 0.7433
Max. : 3.86790 Max. : 3.851857 Max. : 3.5215
pedigree age diabetes
Min. :-1.43127 Min. :-1.23380 pos:375
1st Qu.:-0.69463 1st Qu.:-0.70370 neg:375
Median :-0.16309 Median :-0.14557
Mean : 0.07227 Mean : 0.08137
3rd Qu.: 0.67897 3rd Qu.: 0.81806
Max. : 4.34502 Max. : 2.94107
X variables are less skewed. Y variable is balanced.
lr_model <-
logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
lr_workflow <- workflow () %>%
add_model(lr_model) %>%
add_recipe(diabetes_recipe)
lr_fit <- lr_workflow %>%
fit(data = diabetes_train)
lr_fit_resamples <- lr_workflow %>%
fit_resamples(
resamples = diabetes_cv,
control = control_resamples(save_pred = T)
)
collect_metrics(lr_fit_resamples)
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.752 10 0.0124 Preprocessor1_Model1
2 roc_auc binary 0.832 10 0.0133 Preprocessor1_Model1
diabetes_lr_augment <-
augment(lr_fit, diabetes_test)
diabetes_lr_augment
# A tibble: 192 × 12
pregnant glucose pressure triceps insulin mass pedigree age
* <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 85 66 29 NA 26.6 0.351 31
2 0 137 40 35 168 43.1 2.29 33
3 2 197 70 45 543 30.5 0.158 53
4 7 100 NA NA NA 30 0.484 32
5 1 115 70 30 96 34.6 0.529 32
6 11 143 94 33 146 36.6 0.254 51
7 10 125 70 26 115 31.1 0.205 41
8 7 106 92 18 NA 22.7 0.235 48
9 9 171 110 24 240 45.4 0.721 54
10 1 146 56 NA NA 29.7 0.564 29
# … with 182 more rows, and 4 more variables: diabetes <fct>,
# .pred_class <fct>, .pred_pos <dbl>, .pred_neg <dbl>
metrics <- metric_set(accuracy, roc_auc)
lr_acc <- diabetes_lr_augment %>%
accuracy(truth = diabetes, estimate = .pred_class)
lr_roc_auc <- diabetes_lr_augment %>%
roc_auc(truth = diabetes, estimate = .pred_pos)
lr_metrics <-
bind_rows(lr_acc, lr_roc_auc)
lr_metrics
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.734
2 roc_auc binary 0.861
Another way to collect metrics on test dataset:
lr_diabetes_final <- lr_workflow %>%
last_fit(diabetes_split)
lr_metrics_final <- lr_diabetes_final %>%
collect_metrics()
rf_model <-
rand_forest() %>%
set_args(mtry = tune()) %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
rf_workflow <-
workflow() %>%
add_model(rf_model) %>%
add_recipe(diabetes_recipe)
Carry out tuning based using cross-validation object. First, specify the range of mtry values to try (mtry) Add tuning layer to workflow Assess based on accuracy and roc_auc
rf_grid <- expand.grid(mtry = 3:7)
rf_tune_results <-
rf_workflow %>%
tune_grid(resamples = diabetes_cv,
grid = rf_grid,
metrics = metric_set(accuracy, roc_auc, sens))
rf_tune_results %>%
collect_metrics()
# A tibble: 15 × 7
mtry .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 3 accuracy binary 0.752 10 0.0143 Preprocessor1_Model1
2 3 roc_auc binary 0.800 10 0.0123 Preprocessor1_Model1
3 3 sens binary 0.698 10 0.0352 Preprocessor1_Model1
4 4 accuracy binary 0.743 10 0.0158 Preprocessor1_Model2
5 4 roc_auc binary 0.795 10 0.0124 Preprocessor1_Model2
6 4 sens binary 0.693 10 0.0292 Preprocessor1_Model2
7 5 accuracy binary 0.748 10 0.0139 Preprocessor1_Model3
8 5 roc_auc binary 0.793 10 0.0122 Preprocessor1_Model3
9 5 sens binary 0.698 10 0.0266 Preprocessor1_Model3
10 6 accuracy binary 0.738 10 0.0112 Preprocessor1_Model4
11 6 roc_auc binary 0.792 10 0.0113 Preprocessor1_Model4
12 6 sens binary 0.690 10 0.0296 Preprocessor1_Model4
13 7 accuracy binary 0.740 10 0.0141 Preprocessor1_Model5
14 7 roc_auc binary 0.785 10 0.0126 Preprocessor1_Model5
15 7 sens binary 0.685 10 0.0352 Preprocessor1_Model5
rf_tune_results %>%
collect_metrics() %>%
filter(.metric == "accuracy") %>%
arrange(desc(mean)) # mtry 6 has the highest accuracy
# A tibble: 5 × 7
mtry .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 3 accuracy binary 0.752 10 0.0143 Preprocessor1_Model1
2 5 accuracy binary 0.748 10 0.0139 Preprocessor1_Model3
3 4 accuracy binary 0.743 10 0.0158 Preprocessor1_Model2
4 7 accuracy binary 0.740 10 0.0141 Preprocessor1_Model5
5 6 accuracy binary 0.738 10 0.0112 Preprocessor1_Model4
rf_tune_results %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
arrange(desc(mean)) # mtry 3 has the highest roc_auc
# A tibble: 5 × 7
mtry .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 3 roc_auc binary 0.800 10 0.0123 Preprocessor1_Model1
2 4 roc_auc binary 0.795 10 0.0124 Preprocessor1_Model2
3 5 roc_auc binary 0.793 10 0.0122 Preprocessor1_Model3
4 6 roc_auc binary 0.792 10 0.0113 Preprocessor1_Model4
5 7 roc_auc binary 0.785 10 0.0126 Preprocessor1_Model5
Extract the best value for accuracy metric
finalised_rf_parameters <- rf_tune_results %>%
select_best(metric = "accuracy")
finalised_rf_parameters # mtry 6
# A tibble: 1 × 2
mtry .config
<int> <chr>
1 3 Preprocessor1_Model1
rf_final_workflow <- rf_workflow %>%
finalize_workflow(finalised_rf_parameters)
last_fit function will train model specified using training data, and produce evaluation on the test data.
rf_fit <- rf_final_workflow %>%
last_fit(diabetes_split)
rf_fit %>%
collect_metrics() # acc = 0.792, roc_auc = 0.861
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.807 Preprocessor1_Model1
2 roc_auc binary 0.872 Preprocessor1_Model1
Compared with Log Reg model, RF has higher accuracy and roc_auc
lr_metrics_final
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.734 Preprocessor1_Model1
2 roc_auc binary 0.861 Preprocessor1_Model1
First, collect the predictions from the test set.
# Collect test predictions
test_predictions <- rf_fit %>%
collect_predictions()
Then, generate the confusion matrix
test_predictions %>%
conf_mat(truth = diabetes, estimate = .pred_class)
Truth
Prediction pos neg
pos 52 22
neg 15 103
Random Forest model is better than Log Reg model.
Final model is based on the data in the combined training and testing datasets.
diabetes_final_model <- fit(rf_workflow, diabetes)
To understand more about what variables were important in determining whether patient will be positive for diabetes:
Using vip package:
diabetes_final_model %>%
extract_fit_parsnip() %>%
vip(num_features = 8) +
labs(title = "Variable Importance") +
theme_clean()
Manual extraction:
vip <- diabetes_final_model %>%
extract_fit_parsnip()
vip_variables <- as_tibble(as.list(vip$fit$variable.importance))
vip_variables %>%
pivot_longer(everything()) %>%
ggplot(aes(fct_reorder(name, value), value)) +
geom_col(fill = "deepskyblue3") +
scale_y_continuous(expand = c(0,0)) +
labs(x = "X variables/Predictors",
y = "Importance",
title = "Variable Importance for Final RF Model",
subtitle = "Glucose concentraton in blood is the most important predictor") +
coord_flip() +
theme_clean()
# create a dataset for future patient
future_data <- tribble(
~pregnant, ~glucose, ~pressure, ~triceps, ~insulin, ~mass, ~pedigree, ~age,
5, 140, 64, 50, 190, 30, 0.35, 59
)
predict(diabetes_final_model, new_data = future_data)
# A tibble: 1 × 1
.pred_class
<fct>
1 pos
https://www.rebeccabarter.com/blog/2020-03-25_machine_learning/
https://www.kaggle.com/collinsakal/diabetes-prediction-with-torch-for-r
For attribution, please cite this work as
lruolin (2021, Nov. 6). pRactice corner: Pima Indians Diabetes Dataset. Retrieved from https://lruolin.github.io/myBlog/posts/2021105 Pima Indians - Predicting Diabetes/
BibTeX citation
@misc{lruolin2021pima, author = {lruolin, }, title = {pRactice corner: Pima Indians Diabetes Dataset}, url = {https://lruolin.github.io/myBlog/posts/2021105 Pima Indians - Predicting Diabetes/}, year = {2021} }